/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <rdma/rdma_cm.h>

#include "rds.h"
#include "ib.h"

/* Slab cache for struct rds_ib_incoming; created in rds_ib_recv_init(). */
static struct kmem_cache *rds_ib_incoming_slab;
/* Slab cache for struct rds_page_frag; created in rds_ib_recv_init(). */
static struct kmem_cache *rds_ib_frag_slab;

/*
 * Hand the frag's page back with __free_page() and clear the frag's page
 * pointer so the frag itself can later be freed or given another page.
 */
static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
{
	struct page *page = frag->f_page;

	rdsdebug("frag %p page %p\n", frag, page);
	__free_page(page);
	frag->f_page = NULL;
}

/*
 * Return a frag to the frag slab.  The frag must no longer reference a
 * page; one that still does trips the BUG_ON.
 */
static void rds_ib_frag_free(struct rds_page_frag *frag)
{
	rdsdebug("frag %p page %p\n", frag, frag->f_page);

	BUG_ON(frag->f_page);
	kmem_cache_free(rds_ib_frag_slab, frag);
}

/*
 * We map a page at a time.  It's fragments are posted in order.  This
 * is called in fragment order as the fragments get send completion events.
 * Only the last frag in the page performs the unmapping.
 *
 * It's OK for ring cleanup to call this in whatever order it likes because
 * DMA is not in flight and so we can unmap while other ring entries still
 * hold page references in their frags.
 */
static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
				   struct rds_ib_recv_work *recv)
{
	rdsdebug("recv %p frag %p page %p\n", recv, recv->r_frag,
		 recv->r_frag->f_page);
	BUG_ON(recv->r_sge[0].addr == 0);
	if (recv->r_frag->f_offset == RDS_PAGE_LAST_OFF)
		dma_unmap_page(ic->i_cm_id->device->dma_device,
			       recv->r_sge[0].addr - recv->r_frag->f_offset,
			       PAGE_SIZE, DMA_FROM_DEVICE);
	recv->r_sge[0].addr = 0;
}

void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
{
	struct rds_ib_recv_work *recv;
	u32 i;

	for(i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
		recv->r_ibinc = NULL;
		recv->r_frag = NULL;

		recv->r_wr.next = NULL;
		recv->r_wr.wr_id = i;
		recv->r_wr.sg_list = recv->r_sge;
		recv->r_wr.num_sge = 2;

		recv->r_sge[0].addr = 0;
		recv->r_sge[0].length = RDS_FRAG_SIZE;
		recv->r_sge[0].lkey = ic->i_mr->lkey;

		recv->r_sge[1].addr = ic->i_recv_hdrs_dma +
				      (i * sizeof(struct rds_header));
		recv->r_sge[1].length = sizeof(struct rds_header);
		recv->r_sge[1].lkey = ic->i_mr->lkey;
	}
}

static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
			          struct rds_ib_recv_work *recv)
{
	if (recv->r_ibinc) {
		rds_inc_put(&recv->r_ibinc->ii_inc);
		recv->r_ibinc = NULL;
	}
	if (recv->r_frag) {
		if (recv->r_sge[0].addr)
			rds_ib_recv_unmap_page(ic, recv);
		if (recv->r_frag->f_page)
			rds_ib_frag_drop_page(recv->r_frag);
		rds_ib_frag_free(recv->r_frag);
		recv->r_frag = NULL;
	}
}

/*
 * Tear down all receive state: every ring entry is cleared, and the page
 * that refill was in the middle of carving into fragments (if any) is
 * unmapped and dropped.
 *
 * The carve offset is reset as well so that the connection's i_frag /
 * i_addr state matches what rds_ib_recv_refill_one() leaves behind after it
 * finishes a page.  Without the reset a later refill would start carving
 * its freshly allocated page at the stale offset and waste the head of
 * that page.
 */
void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
{
	u32 i;

	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
		rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);

	/* the partially carved page is still mapped as a whole */
	if (ic->i_addr) {
		dma_unmap_page(ic->i_cm_id->device->dma_device,
			       ic->i_addr, PAGE_SIZE, DMA_FROM_DEVICE);
		ic->i_addr = 0;
	}
	if (ic->i_frag.f_page)
		rds_ib_frag_drop_page(&ic->i_frag);
	ic->i_frag.f_offset = 0;
}

/*
 * Make sure one ring entry has everything it needs to be posted: an inc, a
 * frag, and a RDS_FRAG_SIZE slice of the connection's current DMA-mapped
 * page.  Allocations that already exist from an earlier attempt are reused.
 *
 * kptr_gfp is used for the slab allocations and page_gfp for the page.
 *
 * Returns 0 on success or -ENOMEM if an allocation or the DMA mapping
 * failed; anything obtained before the failure stays attached to the recv
 * or the connection.
 */
static int rds_ib_recv_refill_one(struct rds_connection *conn,
				  struct rds_ib_recv_work *recv,
				  gfp_t kptr_gfp, gfp_t page_gfp)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct rds_page_frag *carve = &ic->i_frag;
	struct rds_page_frag *frag;

	if (!recv->r_ibinc) {
		recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab,
						 kptr_gfp);
		if (!recv->r_ibinc)
			return -ENOMEM;
		INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
		rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
	}

	if (!recv->r_frag) {
		recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp);
		if (!recv->r_frag)
			return -ENOMEM;
		INIT_LIST_HEAD(&recv->r_frag->f_item);
		recv->r_frag->f_page = NULL;
	}
	frag = recv->r_frag;

	/* the connection keeps one page that is carved up across recvs */
	if (!carve->f_page) {
		carve->f_page = alloc_page(page_gfp);
		if (!carve->f_page)
			return -ENOMEM;
	}

	if (!ic->i_addr) {
		ic->i_addr = dma_map_page(ic->i_cm_id->device->dma_device,
					  carve->f_page, 0, PAGE_SIZE,
					  DMA_FROM_DEVICE);
		if (dma_mapping_error(ic->i_addr)) {
			ic->i_addr = 0;
			return -ENOMEM;
		}
	}

	/*
	 * The recv that is handed the RDS_PAGE_LAST_OFF frag is the one
	 * whose rds_ib_recv_unmap_page() call unmaps the page.  That happens
	 * as completions hit in order or on connection shutdown.
	 */
	frag->f_page = carve->f_page;
	frag->f_offset = carve->f_offset;
	recv->r_sge[0].addr = ic->i_addr + carve->f_offset;
	get_page(frag->f_page);

	if (carve->f_offset < RDS_PAGE_LAST_OFF) {
		carve->f_offset += RDS_FRAG_SIZE;
		return 0;
	}

	/* page fully carved: drop the connection's ref and start afresh */
	put_page(carve->f_page);
	carve->f_page = NULL;
	carve->f_offset = 0;
	ic->i_addr = 0;

	return 0;
}

/*
 * This tries to allocate and post unused work requests after making sure that
 * they have all the allocations they need to queue received fragments into
 * sockets.  The i_recv_mutex is held here so that ring_alloc and _unalloc
 * pairs don't go unmatched.
 *
 * Returns 0 once no more entries can be taken from the ring (or the
 * connection has recorded an error), and -1 if preparing or posting an
 * entry failed.  In the failure case the entry that was taken from the ring
 * is handed back; a posting failure additionally marks the connection as
 * errored and queues the connection's c_down_w work.
 */
int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
		       gfp_t page_gfp)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct ib_recv_wr *failed_wr;
	struct rds_ib_recv_work *recv;
	u32 pos;
	int ret = 0;

	for (;;) {
		if (ic->i_wc_err)
			break;
		if (!rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos))
			break;

		recv = &ic->i_recvs[pos];

		ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
		if (ret) {
			ret = -1;
			break;
		}

		/* XXX when can this fail? */
		ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
		rdsdebug("recv %p ibinc %p page %p addr %llu ret %d\n", recv,
			 recv->r_ibinc, recv->r_frag->f_page,
			 (unsigned long long)recv->r_sge[0].addr, ret);
		if (!ret)
			continue;

		ic->i_wc_err = 1;
		printk(KERN_WARNING "RDS/IB: recv post on "
		       "%u.%u.%u.%u returned %d, disconnecting and "
		       "reconnecting\n", NIPQUAD(conn->c_faddr),
		       ret);
		queue_work(rds_wq, &conn->c_down_w);
		ret = -1;
		break;
	}

	/* give back the ring entry that was allocated but never posted */
	if (ret)
		rds_ib_ring_unalloc(&ic->i_recv_ring, 1);

	return ret;
}

void rds_ib_inc_purge(struct rds_incoming *inc)
{
	struct rds_ib_incoming *ibinc;
	struct rds_page_frag *frag;
	struct rds_page_frag *pos;
		
	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
	rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);

	list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
		list_del_init(&frag->f_item);
		rds_ib_frag_drop_page(frag);
		rds_ib_frag_free(frag);
	}
}

/*
 * Free an IB inc: purge any frags still queued on it and then return the
 * containing rds_ib_incoming to its slab.
 */
void rds_ib_inc_free(struct rds_incoming *inc)
{
	struct rds_ib_incoming *ibinc =
		container_of(inc, struct rds_ib_incoming, ii_inc);

	rds_ib_inc_purge(inc);

	rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);

	/* the purge must have emptied the frag list */
	BUG_ON(!list_empty(&ibinc->ii_frags));
	kmem_cache_free(rds_ib_incoming_slab, ibinc);
}

/*
 * Copy the payload of an inc into the user iovec array that starts at
 * first_iov.
 *
 * At most size bytes are copied, and never more than the h_len recorded in
 * the inc's header.  The frags queued on the inc are walked in list order,
 * RDS_FRAG_SIZE bytes each, while the iovec cursor advances independently
 * of the frag cursor.
 *
 * Returns the number of bytes copied, or the non-zero value returned by
 * rds_page_copy_to_user() if one of the copies fails.
 */
int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
			    size_t size)
{
	struct rds_ib_incoming *ibinc;
 	struct rds_page_frag *frag;
	struct iovec *iov = first_iov;
	unsigned long to_copy;
 	unsigned long frag_off = 0;
	unsigned long iov_off = 0;
	int copied = 0;
	int ret;
	u32 len;

	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
 	frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
	len = be32_to_cpu(inc->i_hdr.h_len);

	while (copied < size && copied < len) {
		/* current frag fully consumed: step to the next one */
 		if (frag_off == RDS_FRAG_SIZE) {
 			frag = list_entry(frag->f_item.next,
 					  struct rds_page_frag, f_item);
 			frag_off = 0;
		}
		/* skip iovec entries that are used up (or zero length) */
		while (iov_off == iov->iov_len) {
			iov_off = 0;
			iov++;
		}

		/*
		 * Copy as much as fits in the current iovec entry and the
		 * current frag, without exceeding the caller's size or the
		 * message length.
		 */
 		to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
		to_copy = min_t(size_t, to_copy, size - copied);
		to_copy = min_t(unsigned long, to_copy, len - copied);

		rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
			 "[%p, %lu] + %lu\n", 
			 to_copy, iov->iov_base, iov->iov_len, iov_off,
			 frag->f_page, frag->f_offset, frag_off);

		/* f_offset locates this frag inside its page */
		ret = rds_page_copy_to_user(frag->f_page,
					    frag->f_offset + frag_off,
					    iov->iov_base + iov_off,
					    to_copy);
		if (ret) {
			/* report the failure instead of the partial count */
			copied = ret;
			break;
		}

		iov_off += to_copy;
 		frag_off += to_copy;
		copied += to_copy;
	}

	return copied;
}

/*
 * Set up the connection's single ack send work request and the one sge it
 * points at.  ic starts out kzalloc()ed, so fields not assigned here keep
 * their zeroed value.
 */
void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
{
	/* the sge covers the connection's ack frame */
	ic->i_ack_sge.addr = ic->i_ack_dma;
	ic->i_ack_sge.length = sizeof(struct rds_ib_ack);
	ic->i_ack_sge.lkey = ic->i_mr->lkey;

	ic->i_ack_wr.sg_list = &ic->i_ack_sge;
	ic->i_ack_wr.num_sge = 1;
	ic->i_ack_wr.opcode = IB_WR_SEND;
	ic->i_ack_wr.wr_id = ~0;
	ic->i_ack_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
}

/*
 * You'd think that with reliable IB connections you wouldn't need to ack
 * messages that have been received.  The problem is that IB hardware generates
 * an ack message before it has DMAed the message into memory.  This creates a
 * potential message loss if the HCA is disabled for any reason between when it
 * sends the ack and before the message is DMAed and processed.  This is only a
 * potential issue if another HCA is available for fail-over.
 *
 * When the remote host receives our ack they'll free the sent message from
 * their send queue.  To decrease the latency of this we always send an ack
 * immediately after we've received messages.
 *
 * For simplicity, we only have one ack in flight at a time.  This puts
 * pressure on senders to have deep enough send queues to absorb the latency of
 * a single ack frame being in flight.  This might not be good enough.
 *
 * This is implemented by having a long-lived send_wr and sge which point to a
 * statically allocated ack frame.  This ack wr does not fall under the ring
 * accounting that the tx and rx wrs do.  The QP attribute specifically makes
 * room for it beyond the ring size.  Send completion notices its special
 * wr_id and avoids working with the ring in that case.
 */

/*
 * Fill the ack frame with seq and post the ack work request.  The caller
 * must already have marked an ack as in flight and must not hold
 * i_ack_lock.
 *
 * If the post fails nothing was queued, so no send completion will arrive
 * to clear the in-flight state.  It is cleared here instead; otherwise
 * every later ack attempt would be deferred forever behind an ack that was
 * never sent.
 */
static void rds_ib_send_ack(struct rds_ib_connection *ic, u64 seq)
{
	struct ib_send_wr *failed_wr;
	unsigned long flags;
	int ret;

	ic->i_ack->a_seq = cpu_to_be64(seq);

	ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
	if (ret) {
		spin_lock_irqsave(&ic->i_ack_lock, flags);
		ic->i_ack_in_flight = 0;
		spin_unlock_irqrestore(&ic->i_ack_lock, flags);

		rds_ib_stats_inc(s_ib_ack_send_failure);
	} else {
		rds_ib_stats_inc(s_ib_ack_sent);
	}
}

/*
 * If an ack is already in flight then we record the sequence number of
 * the ack that will be sent immediately as the current ack completes.
 * If ack_next remains 0 when the ack completes then we don't send another
 * ack.  This is pretty sneaky.  There can never be an ack for seq 0 in flight
 * as another ack for seq 0 is attempted.  So we can't confuse a zero ack_next
 * with a desire to send an ack for seq 0.
 *
 * I don't know what sort of concurrency the send and recv completion handlers
 * can experience.  The spinlock is probably overkill.  If the send and recv
 * completion handlers can't execute concurrently then the lock can be
 * squeezed down to atomic bit ops and barriers.
 */
static void rds_ib_attempt_ack(struct rds_ib_connection *ic, u64 seq)
{
	unsigned long flags;
	int deferred;

	spin_lock_irqsave(&ic->i_ack_lock, flags);

	deferred = ic->i_ack_in_flight != 0;
	if (deferred) {
		/* picked up by rds_ib_ack_send_complete() */
		ic->i_ack_next = seq;
	} else {
		ic->i_ack_in_flight = 1;
		ic->i_ack_next = 0;
	}

	spin_unlock_irqrestore(&ic->i_ack_lock, flags);

	rdsdebug("flight %d next %llu this %llu\n", ic->i_ack_in_flight,
		 (unsigned long long)ic->i_ack_next, (unsigned long long)seq);

	if (deferred) {
		rds_ib_stats_inc(s_ib_ack_send_delayed);
		return;
	}

	rds_ib_send_ack(ic, seq);
}

/*
 * Called when the ack send has completed.  If another sequence number was
 * recorded while that ack was in flight it is sent right away and the
 * connection stays "in flight"; otherwise the in-flight state is cleared.
 */
void rds_ib_ack_send_complete(struct rds_ib_connection *ic)
{
	unsigned long flags;
	u64 pending;

	spin_lock_irqsave(&ic->i_ack_lock, flags);

	/* a zero i_ack_next means nothing was deferred */
	pending = ic->i_ack_next;
	ic->i_ack_next = 0;
	ic->i_ack_in_flight = pending ? 1 : 0;

	spin_unlock_irqrestore(&ic->i_ack_lock, flags);

	rdsdebug("flight %d next %llu sending %llu\n", ic->i_ack_in_flight,
		 (unsigned long long)ic->i_ack_next,
		 (unsigned long long)pending);

	if (pending)
		rds_ib_send_ack(ic, pending);
}

static void rds_ib_recv_ack(struct rds_connection *conn,
			    struct rds_ib_recv_work *recv)
{
	struct rds_ib_ack *ack;
	u64 seq;

	rds_ib_stats_inc(s_ib_ack_received);

	ack = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
	seq = be64_to_cpu(ack->a_seq);
	kunmap_atomic(ack, KM_SOFTIRQ0);

	/*
	 * Usually the frags make their way on to incs and are then freed as
	 * the inc is freed.  We don't go that route, so we have to drop the
	 * page ref ourselves.  We can't just leave the page on the recv
	 * because that confuses the dma mapping of pages and each recv's use
	 * of a partial page.  We can leave the frag, though, it will be
	 * reused.
	 */
	rds_ib_frag_drop_page(recv->r_frag);

	rds_send_drop_acked(conn, seq, NULL);
}

/*
 * Work is posted as a RDS_FRAG_SIZE payload and then a header.  This is
 * done so that we can send fragments without headers and keep the fragments
 * large and aligned.  The sender doesn't pad their fragments so the header
 * will spill into the posted regions just after the fragment.
 *
 * The header is the last sizeof(struct rds_header) bytes of the byte_len
 * bytes received.  Depending on how much payload preceded it, the header
 * lies entirely inside the fragment, entirely inside the recv's posted
 * header slot, or straddles the two.  In the straddling case the part found
 * in the header slot is the *tail* of the header and must be placed after
 * the part copied out of the fragment.
 *
 * XXX If we were to flip r_page into userspace or the page cache then we'd
 * have to zero the header and possibly the rest of the page.
 */
static void rds_ib_copy_header(struct rds_ib_connection *ic,
			       struct rds_ib_incoming *ibinc,
			       struct rds_ib_recv_work *recv, u32 byte_len)
{
	char *hdr = (char *)&ibinc->ii_inc.i_hdr;
	u32 in_frag = 0;	/* header bytes taken from the fragment */
	void *addr;
	u32 start;

	BUG_ON(byte_len < sizeof(struct rds_header));

	/* get the start of the header from the tail of the fragment */
	start = byte_len - sizeof(struct rds_header);
	if (start < RDS_FRAG_SIZE) {
		in_frag = min_t(u32, RDS_FRAG_SIZE - start,
				sizeof(struct rds_header));

		addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
		memcpy(hdr, addr + recv->r_frag->f_offset + start, in_frag);
		kunmap_atomic(addr, KM_SOFTIRQ0);
	}

	/*
	 * and the rest that might have spilled into the posted header space.
	 * Sizing the copy from the header size, rather than from byte_len,
	 * keeps it from ever running past the end of the header.
	 */
	if (byte_len > RDS_FRAG_SIZE) {
		memcpy(hdr + in_frag,
		       &ic->i_recv_hdrs[recv - ic->i_recvs],
		       sizeof(struct rds_header) - in_frag);
	}
}

/*
 * It's kind of lame that we're copying from the posted receive pages into
 * long-lived bitmaps.  We could have posted the bitmaps and rdma written into
 * them.  But receiving new congestion bitmaps should be a *rare* event, so
 * hopefully we won't need to invest that complexity in making it more
 * efficient.  By copying we can share a simpler core with TCP which has to
 * copy.
 */
static void rds_ib_cong_recv(struct rds_connection *conn,
			      struct rds_ib_incoming *ibinc)
{
	struct rds_cong_map *map;
	unsigned int map_off;
	unsigned int map_page;
 	struct rds_page_frag *frag;
 	unsigned long frag_off;
	unsigned long to_copy;
	unsigned long copied;
	void *addr;

	/* catch completely corrupt packets */
	if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
		return;

	map = conn->c_fcong;
	map_page = 0;
	map_off = 0;

 	frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
	frag_off = 0;

	copied = 0;

	while (copied < RDS_CONG_MAP_BYTES) {
 		to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);

		addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
		memcpy((void *)map->m_page_addrs[map_page] + map_off,
		       addr + frag_off, to_copy);
		kunmap_atomic(addr, KM_SOFTIRQ0);

		copied += to_copy;

		map_off += to_copy;
		if (map_off == PAGE_SIZE) {
			map_off = 0;
			map_page++;
		}

		frag_off += to_copy;
 		if (frag_off == RDS_FRAG_SIZE) {
 			frag = list_entry(frag->f_item.next,
 					  struct rds_page_frag, f_item);
 			frag_off = 0;
		}
	}

	rds_cong_map_updated(map);
}

/*
 * Rings are posted with all the allocations they'll need to queue the
 * incoming message to the receiving socket so this can't fail.  It relies
 * on being called in the order that the sender sent in to infer which
 * fragments start messages and so will have a header appended.
 */
static void rds_ib_process_recv(struct rds_connection *conn,
				struct rds_ib_recv_work *recv, u32 byte_len)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct rds_ib_incoming *ibinc = ic->i_ibinc;
	struct rds_page_frag *frag;

	/* XXX shut down the connection if port 0,0 are seen? */

	rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
		 byte_len);

	/*
	 * No inc is being assembled on the connection, so this fragment
	 * starts a message and carries its header.  Take the recv's inc,
	 * copy the header into it and remember it on the connection so the
	 * fragments that follow are hung off its list.
	 */
	if (!ibinc) {
		if (byte_len < sizeof(struct rds_header)) {
			/* too short to hold a header; complain only once */
			if (ic->i_wc_err)
				return;

			ic->i_wc_err = 1;
			printk(KERN_WARNING "RDS/IB: incoming message "
			       "from %u.%u.%u.%u didn't inclue a "
			       "header, disconnecting and "
			       "reconnecting\n",
			       NIPQUAD(conn->c_faddr));
			queue_work(rds_wq, &conn->c_down_w);
			return;
		}

		ibinc = recv->r_ibinc;
		recv->r_ibinc = NULL;
		ic->i_ibinc = ibinc;

		rds_ib_copy_header(ic, ibinc, recv, byte_len);

		ic->i_recv_data_rem = be32_to_cpu(ibinc->ii_inc.i_hdr.h_len);
		rdsdebug("ic %p ibinc %p rem %u\n", ic, ibinc,
			 ic->i_recv_data_rem);
	}

	/* ownership of the frag moves from the recv to the inc */
	frag = recv->r_frag;
	recv->r_frag = NULL;
	list_add_tail(&frag->f_item, &ibinc->ii_frags);

	if (ic->i_recv_data_rem > RDS_FRAG_SIZE) {
		/* more fragments of this message are still to come */
		ic->i_recv_data_rem -= RDS_FRAG_SIZE;
		return;
	}

	/* that was the last fragment: the message is complete */
	ic->i_recv_data_rem = 0;
	ic->i_ibinc = NULL;

	if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
		rds_ib_cong_recv(conn, ibinc);
	else
		rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
				  &ibinc->ii_inc, GFP_ATOMIC,
				  KM_SOFTIRQ0);

	rds_inc_put(&ibinc->ii_inc);
}

/*
 * Plucking the oldest entry from the ring can be done concurrently with
 * the thread refilling the ring.  Each ring operation is protected by
 * spinlocks and the transient state of refilling doesn't change the
 * recording of which entry is oldest.
 *
 * This relies on IB only calling one cq comp_handler for each cq so that
 * there will only be one caller of rds_recv_incoming() per RDS connection.
 *
 * context is the rds_connection the cq belongs to.
 */
void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context)
{
	struct rds_connection *conn = context;
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct rds_ib_recv_work *recv;
	struct ib_wc wc;
	u64 seq_before;
	int kick_thread = 1;

	rdsdebug("conn %p cq %p\n", conn, cq);

	rds_ib_stats_inc(s_ib_rx_cq_call);

	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);

	/* remembered so we can tell whether any message was delivered */
	seq_before = conn->c_next_rx_seq;

	while (ib_poll_cq(cq, 1, &wc) > 0) {
		rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
			 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
			 be32_to_cpu(wc.imm_data));
		rds_ib_stats_inc(s_ib_rx_cq_event);

		/* the completion is taken to be for the oldest ring entry */
		recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
		rds_ib_recv_unmap_page(ic, recv);

		if (wc.status == IB_WC_SUCCESS && !ic->i_wc_err) {
			/* an ack frame is recognised purely by its length */
			if (wc.byte_len == sizeof(struct rds_ib_ack))
				rds_ib_recv_ack(conn, recv);
			else
				rds_ib_process_recv(conn, recv, wc.byte_len);
		}

		rds_ib_ring_free(&ic->i_recv_ring, 1);

		/* We expect errors as the qp is drained during shutdown */
		if (wc.status == IB_WC_SUCCESS || ic->i_wc_err)
			continue;

		ic->i_wc_err = 1;
		printk(KERN_WARNING "RDS/IB: completion on "
		       "%u.%u.%u.%u had status %u, disconnecting and "
		       "reconnecting\n", NIPQUAD(conn->c_faddr),
		       wc.status);
		queue_work(rds_wq, &conn->c_down_w);
	}

	if (conn->c_next_rx_seq != seq_before)
		rds_ib_attempt_ack(ic, conn->c_next_rx_seq - 1);

	/*
	 * XXX atomic is bad as it drains reserve pools, we should really
	 * do some non-blocking alloc that doesn't touch the pools but
	 * will fail.  Then leave it to the thread to get to reclaim
	 * and alloc.
	 */

	/*
	 * If we fail to refill we assume it's an allocation failure
	 * from our use of GFP_ATOMIC and we want the thread to try again
	 * immediately.  Similarly, if the thread is already trying to
	 * refill we want it to try again immediately as it may have missed
	 * the ring entry we just completed before it released the
	 * i_recv_mutex.
	 */
	if (mutex_trylock(&ic->i_recv_mutex)) {
		if (rds_ib_recv_refill(conn, GFP_ATOMIC,
				       GFP_ATOMIC | __GFP_HIGHMEM) == 0) {
			rds_ib_stats_inc(s_ib_rx_refill_from_cq);
			kick_thread = 0;
		}
		mutex_unlock(&ic->i_recv_mutex);
	}

	if (kick_thread)
		queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
}

/*
 * Refill the receive ring with blocking allocations while holding
 * i_recv_mutex.
 *
 * Returns 0 if the refill succeeded and -ENOMEM if it did not.  If we get a
 * temporary posting failure in this context then we're really low and we
 * want the caller to back off for a bit.
 */
int rds_ib_recv(struct rds_connection *conn)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	int refill_failed;

	rdsdebug("conn %p\n", conn);

	mutex_lock(&ic->i_recv_mutex);
	refill_failed = rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER);
	if (!refill_failed)
		rds_ib_stats_inc(s_ib_rx_refill_from_thread);
	mutex_unlock(&ic->i_recv_mutex);

	return refill_failed ? -ENOMEM : 0;
}

/*
 * Create the slab caches used for incs and frags.
 *
 * Returns 0 on success or -ENOMEM if either cache could not be created.
 * When the second creation fails the first cache is destroyed again.
 */
int __init rds_ib_recv_init(void)
{
	rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
					sizeof(struct rds_ib_incoming),
					0, 0, NULL, NULL);
	if (!rds_ib_incoming_slab)
		return -ENOMEM;

	rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
					sizeof(struct rds_page_frag),
					0, 0, NULL, NULL);
	if (!rds_ib_frag_slab) {
		kmem_cache_destroy(rds_ib_incoming_slab);
		return -ENOMEM;
	}

	return 0;
}

/* Destroy the slab caches created by rds_ib_recv_init(), newest first. */
void rds_ib_recv_exit(void)
{
	kmem_cache_destroy(rds_ib_frag_slab);
	kmem_cache_destroy(rds_ib_incoming_slab);
}
